# install and declare the libraries
library(readr)
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(RColorBrewer)
#install.packages("ggalluvial")
library(ggalluvial)
#install.packages("treemapify")
library(treemapify)
#install.packages("treemap")
library(treemap)
library(treemapify)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Problem Statement

@ The person who did 2 questions

Task 1

Generate the density plot similar to what is shown in the assignmnet

# Importing the Dataset
df_airline <- read_csv("/Users/yashjaigude/Documents/MS NEU/IE 6600 Comp Viz/Homework 4/airlines_delay.csv")
## Rows: 155244 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): carrier, carrier_name, airport, airport_name
## dbl (12): year, month, arr_flights, arr_del15, arr_cancelled, arr_diverted, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Creating a copy of a data frame
df_airline_copy <- df_airline

# Obtaining the list of columns
x <- c("carrier_delay","late_aircraft_delay", "nas_delay", "security_delay", "weather_delay")

# Taking the log of the Column
df_airline_copy[x] <- log10(df_airline_copy[x])

# Taking the subset
df_airline_copy <- df_airline_copy[,c("carrier", "carrier_delay", "late_aircraft_delay", "nas_delay", "security_delay", "weather_delay")]

# Ploting the density graph and give the particular labels 
df_airline_copy <- df_airline_copy %>% gather(delay, min, -carrier)
ggplot(df_airline_copy, aes(x = min, fill = delay, color = delay)) + 
  geom_density(alpha = 0.3, aes(group = delay)) + 
  ggtitle("Density plot of carrier delay in mins") + 
  xlab("log delay in mins (x10^x)") + 
  ylab("Density")
## Warning: Removed 231974 rows containing non-finite values (`stat_density()`).

Output 1

In the above code we have first created a copy of data frame which was followed by taking the subset of the data frame. The log fuction was used to take log value which was followed by using gather function to reshape the data. Further, the results were plotted using density plot.

Task 2:

Generate correlation plots for arr_flights, arr_del15, arr_cancelled, arr_diverted, arr_delay, carrier_delay, weather_delay, nas_delay, security_delay and late_aircraft_delay.

#take the subset of the data frame
df_corr <- subset(df_airline, select = c('arr_flights', 'arr_del15', 'arr_cancelled', 'arr_diverted', 'arr_delay', 'carrier_delay', 'weather_delay', 'nas_delay', 'security_delay', 'late_aircraft_delay'))
# dropping the NA values from the data frame
df_corr<-df_corr %>% drop_na()
# get the correlation matrix of the data frame
x = cor(df_corr)
# plot the correlation plot of the correlation matrix of the data frame
corrplot.mixed(x, order = 'AOE')

### Output 2 In the code we have used subbset function to select the desried column which was followed by dropna() to remove na values.Further cor function has been used which returns values between -1 to 1.Finally we plotted our result using the corrplot.

Task 3

Based on your observations from the (Dataset: airlines_delay.csv), create any visualization of your choice

# Calculating the average arrival delay time by carrier
df_airline_mean <- aggregate(arr_delay ~ carrier, data = df_airline, FUN = mean, na.rm = TRUE)
colnames(df_airline_mean) <- c("carrier", "mean_arr_delay")

# Plot the average arrival delay time vs carrier
ggplot(df_airline_mean, aes(x = reorder(carrier, mean_arr_delay), y = mean_arr_delay)) +
  geom_bar(stat = "identity", fill = "#1E90FF") +
  labs(x = "Carrier", y = "Mean Arrival Delay Time (minutes)") +
  ggtitle("Mean Arrival Delay Time by Carrier") +
  theme(plot.title = element_text(hjust = 0.5))

### Output 3 In this code we have plotted the average arrival delay time by carrier. First we computed the mean using the aggreagte function. This was followed by plotting our results. We have also made sure to arrange the data for better readability.

Task 4

From (Dataset: wages_jobs.csv) generate a heat map similar to the one shown below. The variable Difference is defined as the difference between number of male employees and the number of female employees. A negative value indicates a greater number of female than male employees. In addition to the plot critique the below visualization:

# Loading the Dataset
df_wage_jobs <- read_csv("/Users/yashjaigude/Documents/MS NEU/IE 6600 Comp Viz/Homework 4/wages_jobs.csv")
## Rows: 50 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Gender, PUMS Occupation, Slug PUMS Occupation, PUMS Industry, Slug...
## dbl (10): ID Gender, ID Year, Year, Record Count, Average Wage, Average Wage...
## lgl  (2): ID Workforce Status, Workforce Status
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Renaming the Columns
names(df_wage_jobs)[names(df_wage_jobs) == "PUMS Occupation"] <- "PUMS_Occupation"
names(df_wage_jobs)[names(df_wage_jobs) == "Total Population"] <- "Total_Population"
names(df_wage_jobs)[names(df_wage_jobs) == "Average Wage"] <- "Average_Wage"


# Aggregating the data 
df_wage_jobs<- aggregate(`Total_Population` ~ Gender + Year + `PUMS_Occupation`+ `Average_Wage`, data = df_wage_jobs, FUN = sum)

df_wage_jobs_female <- subset(df_wage_jobs, Gender == 'Female')
# take the subset  gender - Male
df_wage_jobs_male <- subset(df_wage_jobs, Gender == 'Male')
# apply the inner join on both male and female dataframes by year and occupation
df_wage_jobs_diff <- inner_join(df_wage_jobs_female, df_wage_jobs_male, by = c('Year', 'PUMS_Occupation'))
# add a column of difference and store the value of total male minus total female
df_wage_jobs_diff$Difference <- df_wage_jobs_diff$'Total_Population.x' - df_wage_jobs_diff$'Total_Population.y'
# plot the graph based on the difference of male and female employees 
df_wage_jobs_diff$PUMS_Occupation <- factor(df_wage_jobs_diff$PUMS_Occupation)
ggplot(df_wage_jobs_diff, aes(x = Year, y = PUMS_Occupation, fill= Difference)) + 
  geom_tile(aes(width = 2L)) + 
  scale_fill_gradientn(colors = brewer.pal(8,"Spectral")) +
  ggtitle("Difference between number of \n Male and Female Employees") +
  ylab('Occupation')

### Output 4 In the above code we have renamed the columns first. Then we calulated the mean using aggreagte function in R. In the second stepo we have subset our data into Male and Female. Additionally, we utilized the “inner_join()” function to combine two separate data frames based on their shared “year” and “occupation” values and then calculated the population difference among male and female. To present our results we used ggplot() to createa graph and then assign stitable title and colour scheme using ggtitle() and scale_fill_gradientn() respectively.

Task 6

Create a stacked bar plot for the year 2018 using the axes of occupation and average wage and gender as the colour.

#Importing the wages_jobs dataset
df_wage_jobs <- read_csv("/Users/yashjaigude/Documents/MS NEU/IE 6600 Comp Viz/Homework 4/wages_jobs.csv") %>% drop_na()
## Rows: 50 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Gender, PUMS Occupation, Slug PUMS Occupation, PUMS Industry, Slug...
## dbl (10): ID Gender, ID Year, Year, Record Count, Average Wage, Average Wage...
## lgl  (2): ID Workforce Status, Workforce Status
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Renaming the Columns
names(df_wage_jobs)[names(df_wage_jobs) == "PUMS Occupation"] <- "PUMS_Occupation"
names(df_wage_jobs)[names(df_wage_jobs) == "Total Population"] <- "Total_Population"
names(df_wage_jobs)[names(df_wage_jobs) == "Average Wage"] <- "Average_Wage"

# Dividing Average_Wage by 1000 and rounding it to 2 decimal
(df_wage_jobs$Average_Wage <-  df_wage_jobs$Average_Wage / 1000)
##  [1]  47.77930  41.46703  47.28159  40.64636  39.29412  54.15103  62.09051
##  [8]  53.64722  50.71085  56.38115 139.13326 171.51134 162.72390 128.14924
## [15] 129.08595  97.70502 139.07618 112.79679  84.54759  71.72677  39.11358
## [22]  31.24402  36.73396  35.04949  33.70602  44.55357  48.77678  44.04465
## [29]  35.68942  38.49414  48.98592  40.85674  42.00891  37.52694  33.32234
## [36]  41.65994  42.39229  36.85245  29.59275  30.61145  71.02195  75.74209
## [43]  68.93716  62.98217  66.21928  73.58085  42.96998  58.96915  46.25241
## [50]  26.92701
df_wage_jobs$Average_Wage <- round(df_wage_jobs$Average_Wage, 2)

# Filtering dataset for the Year 2018
df_wage_jobs <- filter(df_wage_jobs, df_wage_jobs$Year == "2018")
df_wage_jobs
## # A tibble: 10 × 17
##    ID Gen…¹ Gender ID Ye…²  Year ID Wo…³ Workf…⁴ Recor…⁵ Avera…⁶ Avera…⁷ Total…⁸
##       <dbl> <chr>    <dbl> <dbl> <lgl>   <lgl>     <dbl>   <dbl>   <dbl>   <dbl>
##  1        2 Female    2018  2018 TRUE    TRUE        901    47.8   2448.   92407
##  2        1 Male      2018  2018 TRUE    TRUE        226    62.1   8822.   25688
##  3        1 Male      2018  2018 TRUE    TRUE       1083   172.   10612.  102787
##  4        2 Female    2018  2018 TRUE    TRUE         67   113.   29083.    6828
##  5        2 Female    2018  2018 TRUE    TRUE        359    39.1   3045.   46422
##  6        1 Male      2018  2018 TRUE    TRUE        150    48.8   8798.   18408
##  7        1 Male      2018  2018 TRUE    TRUE        255    49.0   6748.   29936
##  8        2 Female    2018  2018 TRUE    TRUE        196    42.4   7921.   22927
##  9        1 Male      2018  2018 TRUE    TRUE        428    75.7   4849.   46963
## 10        2 Female    2018  2018 TRUE    TRUE         14    59.0  16157.    1262
## # … with 7 more variables: `Total Population MOE Appx` <dbl>,
## #   PUMS_Occupation <chr>, `ID PUMS Occupation` <dbl>,
## #   `Slug PUMS Occupation` <chr>, `PUMS Industry` <chr>,
## #   `ID PUMS Industry` <dbl>, `Slug PUMS Industry` <chr>, and abbreviated
## #   variable names ¹​`ID Gender`, ²​`ID Year`, ³​`ID Workforce Status`,
## #   ⁴​`Workforce Status`, ⁵​`Record Count`, ⁶​Average_Wage,
## #   ⁷​`Average Wage Appx MOE`, ⁸​Total_Population
# Plotting the stacked bar plot
ggplot(data = df_wage_jobs, aes(x = PUMS_Occupation, y = Average_Wage, fill = Gender)) + geom_bar(stat = "identity", position = "stack") + xlab("Occupation") + ylab("Average Wage") + ggtitle("Stacked Bar Plot for Year 2018 (Scale on Y axis - 1:1000)") + theme_minimal() + theme(axis.text.x = element_text(angle=90, hjust = 1), plot.title = element_text(hjust = 0.5)) + ylim(0, 300) + geom_text(aes(label = Average_Wage), size = 3, hjust = 0.5, vjust = 3, position ="stack")

Output 6

After importing df_wage_jobs dataset, we have cleaned it by removing na values by drop_na() and renamed the column names for feasibility. Divided average_wage column by 1000 to visualize the labels effectively. Mentioned the scale as 1:1000. Filtered dataset for the Year 2018. Using ggplot, plotted the stacked bar graph with geom_bar() and formatted the plot by using geom_text(), ylim() and theme() functions.

Task 7

Create a tree map from occupations dataset. The area of each rectangle should be proportional to the number of people working in that Detailed Occupation.

#Importing occupations dataset
df_occupations <- read_csv("/Users/yashjaigude/Documents/MS NEU/IE 6600 Comp Viz/Homework 4/occupations.csv")
## Rows: 646 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (15): ID Major Occupation Group, Major Occupation Group, ID Minor Occupa...
## dbl  (8): ID Year, Year, ID Industry Sub-Sector, Total Population, Total Pop...
## lgl  (2): ID Workforce Status, Workforce Status
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Renaming the Columns
names(df_occupations)[names(df_occupations) == "Major Occupation Group"] <- "Major_Occupation_Group"
names(df_occupations)[names(df_occupations) == "Detailed Occupation"] <- "Detailed_Occupation"
names(df_occupations)[names(df_occupations) == "Total Population"] <- "Total_Population"


#Filtering for the year 2018 and using grouped data by Major_Occupation_Group and Detailed_Occupation
occupations_filtered <- df_occupations %>% filter(Year=='2018') %>% group_by(Major_Occupation_Group, Detailed_Occupation) %>% summarise(total = Total_Population, .groups = NULL)
## `summarise()` has grouped output by 'Major_Occupation_Group'. You can override
## using the `.groups` argument.
# Creating treemap by geom_treemap from ggplot by considering quantitative variable as Total Population
tree_map <- ggplot(occupations_filtered, aes(area = total, label = Detailed_Occupation, fill=Major_Occupation_Group, subgroup=Major_Occupation_Group)) + geom_treemap() + ggtitle("Workforce Distribution by Detailed Occupation for 2018") + geom_treemap_text(fontface = 'bold', colour = "white", place = "centre", grow = TRUE)

tree_map

Output 7

Imported df_occupations dataset and renamed the columns. Filtered the dataset for the year 2018 and grouped by Major_Occupation_Group and Detailed_Occupation variables. Created treemap by geom_treemap() from ggplot and formatted the plot using geom_treemap_text() and ggtitle().

Task 8

Explore Plotly in R here and create any chart of your choice from any of the datasets provided in this homework.

# Query the columns to plot
req_col <- c("carrier","year","carrier_delay")

df_airline_req <- df_airline[,req_col] %>%
mutate(across(!carrier, ~replace_na(.x, 0)))

# Aggregate carrier delay column by carriers available
df_airline_arr <- aggregate(df_airline_req$carrier_delay, by=list(carrier=df_airline_req$carrier), FUN = sum) %>%
arrange(desc(x))
df_airline_top5 <- head(df_airline_arr, 5)

# Grouped data by year and carriers 
df_aggregate_year <- df_airline_req %>%
group_by(year,carrier) %>%
summarise(Total = sum(carrier_delay)) 
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Used plotly to create a plot between Total carrier delay and Year grouped by carrier
df_airline_comb <- inner_join(df_aggregate_year, df_airline_top5, by = "carrier")
df_plot <- plot_ly(data = df_airline_comb, x = ~year, y = ~Total, color = ~carrier,mode='line') %>% layout(title = 'Airlines Carrier Delays with Year ', plot_bgcolor = "grey", xaxis = list(title = 'Year'), yaxis = list(title = 'Total Carrier Delays'))
df_plot
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

Output 8

Aggregated carrier delays by carriers available in dataset. Grouped data by Year and carriers. Used plotly to create a plot between Total carrier delay and Year grouped by carriers available. Plot labels for X and Y axis are given by layout().